import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns; sns.set(color_codes=True)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
sys.setrecursionlimit(5000)
mpl.style.use('default')
import shelve
filename='./shelve_TAD_3dBrowser_analysis.out'
my_shelf = shelve.open(filename,'n') # 'n' for new
shelved = []
error = []
for key in dir():
if not(key.startswith('_')):
try:
my_shelf[key] = globals()[key]
shelved.append(str(key))
except:
error.append(str(key))
my_shelf.close()
print("## Successfully Shelved: ##")
_ = [print(str(i)) for i in shelved]
print("\n## ERROR Shelving: ##")
_ = [print(str(i)) for i in error]
import shelve
filename='./shelve_TAD_3dBrowser_analysis.out'
my_shelf = shelve.open(filename)
opened = []
error = []
for key in my_shelf:
try:
globals()[key]=my_shelf[key]
opened.append(str(key))
except:
error.append(str(key))
my_shelf.close()
print("## Successfully Opened: ##")
_ = [print(str(i)) for i in opened]
print("\n## ERROR Shelving: ##")
_ = [print(str(i)) for i in error]
import csv
with open('/dors/capra_lab/users/evonne/TAD/data/3dGenomeBrowser_cellTypes_fileLocs_ordered.txt') as f:
reader = csv.reader(f, delimiter = "\t")
input_file_list = list(reader)
cell_type_tads = {}
for line in input_file_list:
cell_type_tads[line[1]] = pd.read_table(line[0], header=None)
cell_type_tads[line[1]].columns = ['chr','start','stop']
# access data frames in the dictionary like this: cell_type_tads['bladder'].head()
sum = 0
num_of_TADs = []
for key in cell_type_tads:
print(key + ": " + str(len(cell_type_tads[key])))
sum = sum + len(cell_type_tads[key])
num_of_TADs.append(len(cell_type_tads[key]))
print("\n\n\n" + str(sum) + " number of NON unique TADs over 37 unique tissue/cell types")
_ = plt.hist(num_of_TADs, bins=15)
plt.show()
sns.set_palette(sns.husl_palette(37,l=.6))
ax = plt.subplot(111)
diff_matrix = pd.DataFrame()
for key in cell_type_tads:
diff = pd.DataFrame(cell_type_tads[key]['stop'] - cell_type_tads[key]['start'])
diff.columns = [key]
ax = diff.plot.kde(xlim =(0,5000000),figsize=(10,6), title='Zoom-ed in TAD size by tissue/cell type', ax=ax)
diff_matrix = pd.concat([diff_matrix,diff],axis=1)
ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1))
plt.show()
mpl.style.use('default')
g = sns.set(rc={'figure.figsize':(50,4)})
g = sns.violinplot(data=diff_matrix,figsize=(60,6))
g = plt.xticks(rotation=45)
g = plt.ylim(0,5000000)
def betweenTADsize(data_frame): # input: BED file coordinates
diff = np.array([])
for i in range(len(data_frame) - 1):
if data_frame.loc[i + 1,'chr'] == data_frame.loc[i,'chr']:
difference = data_frame.loc[i + 1,'start'] - data_frame.loc[i,'stop']
diff = np.append(diff, difference)
return(pd.DataFrame(diff)) # returns data frame of the basepair size between TAD boundaries
tads_merged = pd.DataFrame(columns = ['chr','start','stop'])
for key in cell_type_tads:
tads_merged = pd.merge(tads_merged,cell_type_tads[key],how='outer',sort=True)
tads_merged = tads_merged.drop_duplicates() # there shouldnt be any duplicates but drop them if there are
print(str(len(tads_merged)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given no \"buffer\" for the start/stop location")
sns.set_palette(sns.husl_palette(37,l=.6))
ax = plt.subplot(111)
diff_matrix = pd.DataFrame()
for key in cell_type_tads:
diff = betweenTADsize(cell_type_tads[key])
diff.columns = [key]
ax = diff.plot.kde(xlim = (0,800000), figsize=(10,6), title='Zoom-ed in interTAD distances by tissue/cell type', ax=ax)
diff_matrix = pd.concat([diff_matrix,diff],axis=1)
#diff.describe()
ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1))
plt.show()
mpl.style.use('default')
g = sns.set(rc={'figure.figsize':(50,4)})
g = sns.violinplot(data=diff_matrix,figsize=(60,6))
g = plt.xticks(rotation=45)
g = plt.ylim(0,500000)
Note: some of the TAD boundaries probably truely represent the same TAD boundary but start/stop at a slightly different spot
def createMasterTADList(buffer, tads_merged):
## Input:
# buffer: integer which is the window of the buffer (if you give 200KB you add 100kb and subtract 100kb to look for other TAD boundaries)
# tads_merged: a list of all the TAD boundaries merged
buffer = buffer*2
tmp = tads_merged.copy(deep=True)
TAD_master = []
while len(tmp) != 0:
start = tmp.loc[0,'start']
stop = tmp.loc[0,'stop']
chrom = tmp.loc[0,'chr']
start_min = start - buffer
start_max = start + buffer
stop_min = stop - buffer
stop_max = stop + buffer
tmp.drop(tmp[(tmp['chr'] == chrom) & (tmp['start'] > start_min) & (tmp['start'] < start_max) & (tmp['stop'] > stop_min) & (tmp['stop'] < stop_max)].index, inplace=True)
tmp = tmp.reset_index(drop=True)
TAD_master.append([chrom,start_min,start_max,stop_min,stop_max])
del tmp
TAD_master = pd.DataFrame(np.vstack(TAD_master))
TAD_master.columns = ['chr','start_min','start_max','stop_min','stop_max']
TAD_master = TAD_master.apply(pd.to_numeric, errors = 'ignore')
return(TAD_master)
# Note: this may take some time to run
buffer = 15000
TAD_master_30k = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master_30k)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")
buffer = 1
TAD_master = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")
buffer = 50000
TAD_master_100k = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master_100k)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")
TAD_master_30k.head()
def findTADinCellTypes(data_frame, stringOfCellTypeName, TAD_total):
## Input:
# data_frame: BED file coordinates data frame - cell type specific
# stringOfCellTypeName: string describing the cell type name
# TAD_total: dataframe of all merged TADs - the "master" list
TAD_total[stringOfCellTypeName] = 0
for i in range(len(data_frame)):
index = TAD_total[(TAD_total['chr'] == data_frame.loc[i,'chr']) & (TAD_total['start_min'] < data_frame.loc[i,'start']) & (TAD_total['start_max'] > data_frame.loc[i,'start']) & (TAD_total['stop_min'] < data_frame.loc[i,'stop']) & (TAD_total['stop_max'] > data_frame.loc[i,'stop'])].index
TAD_total.loc[index,stringOfCellTypeName] = 1
return TAD_total
#Note: this may take some time to run
#findTADinCellTypes(cell_type_tads['bladder'],'bladder',TAD_master).head()
for key in cell_type_tads:
TAD_master_30k = findTADinCellTypes(cell_type_tads[key], key, TAD_master_30k)
TAD_master = findTADinCellTypes(cell_type_tads[key], key, TAD_master)
TAD_master_100k = findTADinCellTypes(cell_type_tads[key], key, TAD_master_100k)
#g = sns.clustermap(TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))
#g = sns.clustermap(TAD_master_30k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))
#g = sns.clustermap(TAD_master_100k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))
different clustering method:
# Correlation matrix between tissue types, two different ways
corr = TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()
corr30k = TAD_master_30k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()
corr100k = TAD_master_100k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()
g = sns.clustermap(corr, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels
g = sns.clustermap(corr30k, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels
g = sns.clustermap(corr100k, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels
Bladder + Adrenal ? Right Ventricle & Left Ventricle - good sanity check, also aorta is nearby and psoas which i would expect (muscular tissues) The lieberman samples are all close - an artifact or makes sense? Dixon data as well some of the cancers are close but some are also part of an outgroup ? (rhabdomyosarcoma)
def chromTADcorr(chrom_str, chr_length, TAD_master, tissue_of_interest, win):
total_corr = []
slide = int(win/5)
for i in range(slide,chr_length,slide):
low = i - (win/2)
high = i + (win/2)
corr = TAD_master[(TAD_master['chr'] == chrom_str) & (((TAD_master['start_min'] + TAD_master['stop_max'])/2) > low) & (((TAD_master['start_min'] + TAD_master['stop_max'])/2) < high)].drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr().loc[tissue_of_interest]
total_corr.append(corr)
total_corr = pd.DataFrame(np.vstack(total_corr))
total_corr.columns = TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).columns
return(total_corr)
chr1_len = 249250621
win = 5000000
total_corr = chromTADcorr('chr1', chr1_len, TAD_master_30k, 'rightVentricle_schmitt2016', win)
sns.set_palette(sns.husl_palette(37,l=.6))
ax = plt.subplot(111)
ax = total_corr.plot(figsize=(50,6),title='Chr1',ax=ax)
ax.legend(loc='upper center', bbox_to_anchor=(1, 1))
mpl.style.use('default')
Next steps idea:
* Are there cell/tissue-specific genes in areas where TADs are unique for that cell/tissue type?
* Take correlation of all TAD/CTCF structure at multiple different locations
* If the structure is very conserved (high correlation) for all except 1 or 2 tissue types
* Find genes within that TAD (near that boundary) and see if they might be cell/tissue specific - could do this by looking at gene/phenotype ontology
def findDistofConservedTADs(TAD_master, name_of_dataset, percentile):
unique = []
for i in range(1,38):
unique.append([i,len(TAD_master[(TAD_master.iloc[:,5:].sum(axis=1) == i)])])
plt.bar(np.array(unique)[:,0], np.array(unique)[:,1])
plt.title(name_of_dataset)
plt.show()
b = np.cumsum(np.array(unique)[:,1])/np.sum(np.array(unique)[:,1]) * 100
print("The "+ str(percentile) + "th percentile of conservation of TADs (in the dataset: " + name_of_dataset + ") is TADs that are seen in " + str(len(b[b < percentile])) + "+ tissue types.")
return(unique)
unique = findDistofConservedTADs(TAD_master, 'No window', 99)
unique = findDistofConservedTADs(TAD_master_30k, '30k window', 99)
unique = findDistofConservedTADs(TAD_master_100k, '100k window', 99)
unique[23:]
# Just to start with, gives 38 regions
ultraconservedSubset = TAD_master_30k[(TAD_master_30k.iloc[:,5:].sum(axis=1) > 10)].loc[:,['chr','start_min','stop_max']]
ultraconservedSubset.to_csv('ultraconservedSubset.bed',sep='\t',index=False,header=False)
uniqueTAD = TAD_master_100k[(TAD_master_100k.iloc[:,5:].sum(axis=1) <= 1) & (TAD_master_100k.loc[:,'rightVentricle'] == 1)].loc[:,['chr','start_max','stop_min']]
uniqueTAD.to_csv('unique_rightVentricle.bed',sep='\t',index=False,header=False)
uniqueTAD = TAD_master_100k[(TAD_master_100k.iloc[:,5:].sum(axis=1) <= 1) & (TAD_master_100k.loc[:,'H1_ESC_Dixon2015'] == 1)].loc[:,['chr','start_max','stop_min']]
uniqueTAD.to_csv('unique_H1_ESC_Dixon2015.bed',sep='\t',index=False,header=False)